It turns out that words like the n-word and "TERF" appear very frequently in the Holyoke and Smith confessionals, respectively. We want to dig deeper into how these and other controversial words are being used.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import functools
from os import path
from scipy.ndimage import imread
from nltk.util import ngrams
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from IPython.display import display
import cufflinks as cf
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Enable offline plotly rendering inside the notebook.
init_notebook_mode()
# Use ggplot styling for all matplotlib figures.
plt.style.use('ggplot')
# IPython magic: render matplotlib output inline.
%matplotlib inline
# Reading in data
# Cleaned (tokenized) Holyoke confessional tables: comments, reports, secrets.
holyc_df = pd.read_csv('../tmp/clean/holyokecon_confessional_comments.csv')
holyr_df = pd.read_csv('../tmp/clean/holyokecon_confessional_reports.csv')
holys_df = pd.read_csv('../tmp/clean/holyokecon_confessional_secrets.csv')
# Raw (un-tokenized) tables, used later for original confession/report text.
holyraw_df = pd.read_csv('../tmp/raw/holyokecon_confessional_secrets.csv')
holyrawr_df = pd.read_csv('../tmp/raw/holyokecon_confessional_reports.csv')
# defining some global variables
# Column names for the cleaned token text of secrets and reports.
SECRET_COL = 'clean_tokens_secret'
REPORT_COL = 'clean_tokens_report'
# Left-join secrets to their reports: a secret with k reports yields k rows,
# an unreported secret yields one row with null report columns.
holysr_df = holys_df.merge(holyr_df, left_on='id', right_on="secret_id",
                           how='left', suffixes=('_secret', '_report'))
# Attach the raw confession text and creation date for each secret.
holysr_df = holysr_df.merge(holyraw_df[['id', 'create_date', 'confession']],
                            left_on='id_secret', right_on='id', how='left')
# Attach the raw report reason for each report row.
holysr_df = holysr_df.merge(holyrawr_df[['id', 'reason']],
                            left_on='id_report', right_on='id', how='left')
holysr_df.rename(columns={'reason': 'report_reason'}, inplace=True)
#preprocess: remove rows with null clean_tokens_secret value
holysr_df = holysr_df[holysr_df[SECRET_COL].notnull()]
holysr_df.head()
# detecting secrets containing a specific word
# NOTE(review): str.contains treats this as a regex; the alternation here is
# safe, but future patterns with regex metacharacters would need escaping.
# Other patterns explored previously: asian|yellow|latino|white|trans|bi and
# racial slurs (omitted here).
pattern = r'gay|lesbian|bisex'
selector = holysr_df[SECRET_COL].str.contains(pattern)
# All secret/report rows whose cleaned secret text matches the pattern.
match_df = holysr_df[selector]
# Drop duplicate secrets
# (a secret appears once per report, so deduplicate on the secret id)
match_secrets = match_df.drop_duplicates('id_secret')
# Match not reported
match_not_reported = match_secrets[match_secrets['id_report'].isnull()]
# Match reported
match_reported = match_secrets[match_secrets['id_report'].notnull()]
# Select report text
# Rows that actually carry report token text (one per report, not per secret).
report_text = match_df[match_df[REPORT_COL].notnull()]
# Shared WordCloud constructor options; random_state fixed for reproducibility.
word_cloud_options = {
    'width': 800,
    'height': 800,
    'background_color': "white",
    'max_words': 500,
    'stopwords': STOPWORDS,
    'random_state': 42
}
def create_word_cloud(text_iterable, image_color_fp=None,
                      title='', **kwargs):
    """Render a word cloud of the joined texts with matplotlib.

    Parameters
    ----------
    text_iterable : iterable of str
        Documents to concatenate into one corpus for the cloud.
    image_color_fp : str or None
        Path to an image used both as the cloud mask and its color source.
        When None, the cloud is drawn with default shape and colors.
        (The original crashed on the default: None was passed to imread.)
    title : str
        Figure title.
    **kwargs
        Forwarded to the WordCloud constructor.
    """
    if image_color_fp is not None:
        mask_image = imread(image_color_fp)
        kwargs.update({'mask': mask_image})
    wc = WordCloud(**kwargs)
    wc.generate(" ".join(text_iterable))
    plt.figure(figsize=(8, 8))
    plt.title(title)
    if image_color_fp is not None:
        # Recolor words from the mask image's pixel colors.
        image_colors = ImageColorGenerator(mask_image)
        plt.imshow(wc.recolor(color_func=image_colors))
    else:
        plt.imshow(wc)
    plt.axis("off")
    plt.show()
# Site logo: used as the word-cloud mask and color source.
logo_fp = '../assets/logo2.png'
# Word Cloud of Match
# Render the matched secrets shaped and colored by the logo.
create_word_cloud(match_secrets[SECRET_COL].astype(str),
                  logo_fp, title="Holyoke Secrets Containing the word %s" % pattern,
                  **word_cloud_options)
# Defining functions to compute word frequency
def word_counter(text, n=1, length_thres=50):
    """Count 1..n-gram frequencies in a whitespace-tokenized text.

    Tokens of length >= length_thres are discarded before counting.
    Returns a Counter mapping each ngram (tokens joined by a space)
    to its frequency.

    Fixes two bugs in the original: the token list was extended in
    place inside the loop, which (a) double-counted unigrams and
    (b) computed higher-order ngrams over the already-extended list,
    producing spurious ngrams that spanned appended tokens.
    """
    tokens = [tk for tk in text.split() if len(tk) < length_thres]
    counts = Counter()
    for size in range(1, n + 1):
        # Equivalent to nltk.util.ngrams(tokens, size), stdlib-only.
        counts.update(" ".join(gram)
                      for gram in zip(*(tokens[i:] for i in range(size))))
    return counts
def word_aggregater(corpus_list, n=1):
    """Sum 1..n-gram counts over every document in corpus_list."""
    return functools.reduce(
        lambda totals, doc: totals + word_counter(doc, n=n),
        corpus_list,
        Counter(),
    )
def count_token_frequency(token_series, filter_thres, **kwargs):
    """Build a frequency table (word, frequency, ngrams) from a token series.

    Parameters
    ----------
    token_series : iterable of str
        Documents whose ngram frequencies are aggregated.
    filter_thres : int
        Keep only ngrams occurring strictly more than this many times.
    **kwargs
        Forwarded to word_aggregater (e.g. n= for the max ngram size).

    Returns a DataFrame sorted by descending frequency with a fresh index.
    """
    counts = word_aggregater(token_series, **kwargs)
    # list() + explicit columns: works on Python 3 (dict views) and on an
    # empty counter, where the original rename of columns 0/1 would fail.
    freq_df = pd.DataFrame(list(counts.items()), columns=['word', 'frequency'])
    freq_df = freq_df[freq_df['frequency'] > filter_thres] \
        .sort_values('frequency', ascending=False)
    # ngram order = number of whitespace-separated tokens in the entry.
    freq_df['ngrams'] = freq_df['word'].apply(lambda x: len(x.split()))
    return freq_df.reset_index(drop=True)
# create frequency count dataframes
# Up to trigrams (n=3), no frequency floor (filter_thres=0).
secrets_corpus = count_token_frequency(match_secrets['clean_tokens_secret'], 0, n=3)
secrets_not_reported_corpus = count_token_frequency(match_not_reported['clean_tokens_secret'], 0, n=3)
secrets_reported_corpus = count_token_frequency(match_reported['clean_tokens_secret'], 0, n=3)
report_text_corpus = count_token_frequency(report_text['clean_tokens_secret'], 0, n=3)
# merge frequencies for all secrets, reported, and not reported
# Left-join on the ngram text; words absent from a sub-corpus get NaN.
merge_cols = ['word', 'frequency']
all_corpus = secrets_corpus.merge(secrets_not_reported_corpus[merge_cols], on="word",
                                  how="left", suffixes=("_all", "_not_reported"))
all_corpus = all_corpus.merge(secrets_reported_corpus[merge_cols], on="word", how="left")
all_corpus = all_corpus.rename(columns={'frequency': 'frequency_reported'})
all_corpus.head()
# sanity check on the word frequency counter:
# since not_reported and reported secrets should
# be mutually exclusive, frequency_all should equal
# the sum of frequency_not_reported and frequency_reported
# (sum skips NaN, so a word present in only one sub-corpus still checks out)
secret_sum = all_corpus[['frequency_not_reported', 'frequency_reported']].sum(axis=1)
not_equal = all_corpus[~(secret_sum == all_corpus['frequency_all'])]
# Number of mismatching rows; expected to be 0.
print not_equal.shape[0]
# creating custom annotations for the plot
# when you hover over a specific bar on the plot,
# you should be able to see the top 4 posts
# containing that word, sorted by number of comments
def format_text_annotation(text_list, n=60):
    """Join texts into one <br>-separated hover-annotation string.

    Each text is stripped of non-ascii characters, whitespace-normalized,
    and truncated to n characters with a trailing ellipsis.

    Accepts both byte strings and unicode: the original assumed byte
    strings and crashed on unicode input (and on any input under Python 3,
    where str has no .decode).
    """
    cleaned = []
    for t in text_list:
        if isinstance(t, bytes):
            t = t.decode('utf-8', 'ignore')
        # Drop non-ascii characters, as the original encode('ascii', 'ignore') did.
        t = t.encode('ascii', 'ignore').decode('ascii')
        cleaned.append(" ".join(t.split()))
    return "<br>".join(t if len(t) < n else t[:n] + "..." for t in cleaned)
def token_top_secrets(token, comment_col='comments', n=5):
    """Hover text: up to n most-commented distinct confessions containing token."""
    matches = holysr_df[holysr_df[SECRET_COL].str.contains(token)]
    ranked = (matches.sort_values(comment_col, ascending=False)['confession']
              .drop_duplicates()
              .tolist())
    # Slicing already copes with fewer than n matches.
    return format_text_annotation(ranked[:n])
def token_reports_text(token, comment_col='comments', n=5):
    """Hover text: up to n most-commented distinct reported confessions containing token."""
    matches = report_text[report_text[SECRET_COL].str.contains(token)]
    ranked = (matches.sort_values(comment_col, ascending=False)['confession']
              .drop_duplicates()
              .tolist())
    # Slicing already copes with fewer than n matches.
    return format_text_annotation(ranked[:n])
# filter all_corpus to pick top n tokens for each ngram
n = 25
# Keep the 25 most frequent unigrams, bigrams, and trigrams.
all_corpus = pd.concat([
    all_corpus[all_corpus['ngrams'] == 1].sort_values('frequency_all', ascending=False)[:n],
    all_corpus[all_corpus['ngrams'] == 2].sort_values('frequency_all', ascending=False)[:n],
    all_corpus[all_corpus['ngrams'] == 3].sort_values('frequency_all', ascending=False)[:n]
])
# Attach hover annotations: top confessions / reported confessions per token.
all_corpus['top_secrets'] = all_corpus['word'].apply(token_top_secrets)
all_corpus['top_reports'] = all_corpus['word'].apply(token_reports_text)
all_corpus.head()
def create_bar_trace(dataframe, graph_obj, x_col, y_col, text_col, **go_kwargs):
    """Construct a plotly trace from dataframe columns.

    NOTE: despite the names, x_col supplies the trace's y values and
    y_col supplies its x values (bars are labeled by y_col along x).
    """
    trace_kwargs = dict(go_kwargs)
    trace_kwargs['y'] = dataframe[x_col]
    trace_kwargs['x'] = dataframe[y_col]
    trace_kwargs['text'] = dataframe[text_col]
    return graph_obj(**trace_kwargs)
def create_word_freq_subplot(dataframe, ngrams=1):
    """Build the [not-reported, reported] bar traces for one ngram size.

    Rows are restricted to the given ngram order and sorted by total
    frequency, descending, so bars appear most-frequent first.
    """
    subset = (dataframe[dataframe['ngrams'] == ngrams]
              .sort_values('frequency_all', ascending=False))
    trace_specs = [
        ('frequency_not_reported', 'top_secrets', '<b>Not Reported</b>', '#bc94d3'),
        ('frequency_reported', 'top_reports', '<b>Reported</b>', '#8551a3'),
    ]
    return [
        create_bar_trace(subset, go.Bar, freq_col, 'word', text_col,
                         name=label,
                         marker={'color': color},
                         showlegend=False)
        for freq_col, text_col, label, color in trace_specs
    ]
def add_subplot_fig(fig, row, col, traces):
    """Append every trace in traces to subplot (row, col) of fig.

    Returns the same figure object to allow chaining.
    """
    append = fig.append_trace
    for trace in traces:
        append(trace, row, col)
    return fig
# Build one pair of stacked-bar traces per ngram size.
subplot1 = create_word_freq_subplot(all_corpus, ngrams=1)
subplot2 = create_word_freq_subplot(all_corpus, ngrams=2)
subplot3 = create_word_freq_subplot(all_corpus, ngrams=3)
# Three stacked rows: unigrams, bigrams, trigrams.
fig = tools.make_subplots(rows=3, cols=1,
                          subplot_titles=('Unigrams', 'Bigrams', 'Trigrams'),
                          vertical_spacing = 0.12)
add_subplot_fig(fig, 1, 1, subplot1)
add_subplot_fig(fig, 2, 1, subplot2)
add_subplot_fig(fig, 3, 1, subplot3)
# Title quotes each alternate from the search pattern.
title = 'Frequency of Words/Phrases in Confessions Containing %s' \
    % ", ".join(["\"" + p + "\"" for p in pattern.split('|')])
# NOTE(review): xaxis_domain is assigned but never used below.
xaxis_domain = fig['layout']['xaxis1']['domain']
# barmode 'stack' overlays reported on top of not-reported counts;
# tick labels angled so long ngrams stay readable.
fig['layout'].update(
    {
        'title': title,
        'titlefont': {'size': 20},
        'height': 1200,
        'width': 1000,
        'barmode': 'stack',
        'margin': {'l': 100, 'r': 100, 'b': 125, 't': 100, 'pad': 10},
        'xaxis1': {
            'tickangle': -45
        },
        'xaxis2': {
            'tickangle': -45
        },
        'xaxis3': {
            'tickangle': -45
        }
    }
)
iplot(fig)